Context:
What am I hoping to achieve with this project?
Key points:
——————————————- ** add a screenshot of the app **
Disclaimer: I am in no way affiliated with Toggl. I started using it a few years ago because I loved its minimalistic design, which nevertheless provided all the functionality I needed. I am using it on a free membership basis.
# Global knitr chunk options for the rendered report.
knitr::opts_chunk$set(
echo = TRUE, # show all of the code
tidy = FALSE, # cleaner code printing
size = "small", # smaller code
fig.path = "figs/",# where the figures will end up
out.width = "100%",
message = FALSE,
warning = FALSE
# Monthly cumulative practice time in hours; `max` keeps a label only for the
# largest cumulative value so the plot annotates just the latest month.
# NOTE(review): ifelse() here coerces `max` to character ("" vs a number) —
# confirm that is intended by the downstream geom_text().
)timeline <- raw_data%>%
group_by(Month_format)%>%
summarise(Total_Duration = sum(Duration)/60)%>%
mutate(Total_Duration2 = cumsum(Total_Duration),
max = as.integer(max(Total_Duration2)),
max = ifelse(max > Total_Duration2, "", max))
#correct exam dates
#can be automated ifelse?
# Cumulative practice timeline with hand-placed exam annotations.
# NOTE(review): annotation coordinates (e.g. 300+393.28333) are hard-coded;
# see the two comments above about automating the exam dates.
ggplot(timeline, aes(Month_format, Total_Duration2, group = 1))+
geom_line(size = 2, color = "#69b3a2")+
geom_point(size = 5, color = "#69b3a2")+
geom_area(alpha = 0.3, fill = "#69b3a2")+
#grade 3
geom_point(x="Oct\n '19", y = 300+393.28333, size = 5, color = "dark red")+
geom_text(x="Oct\n '19", y = 300+443.28333, label = "Grade 3")+
#grade 5
geom_point(x="Oct\n '20", y = 300+795.86667, size = 5, color = "dark red")+
geom_text(x="Oct\n '20", y = 1140.86667, size = 5, label = "Grade 5")+
geom_text(x="Oct\n '20", y = 300+745.86667, size = 5, label = "840 hours")+
# NOW
geom_point(aes(x="Apr\n '21", y = 1219), size = 5, color = "dark red")+
geom_text(aes(label = max), nudge_y = 75, nudge_x = -0.5, size = 5)+
scale_fill_gradient(low="yellow", high="red")+
labs(x = NULL,
title = "Piano practice timeline")+
theme_ipsum_es()+
# Animated per-piece cumulative practice (gganimate): each bubble is a piece,
# filled by Level, growing over time; rendered then saved as a gif below.
theme(legend.position = "top")raw_data%>%
filter(Date_Start > as.Date("2018/11/01"))%>%
group_by(Project, Date_Start)%>%
summarise(Duration = sum(Duration)/60)%>%
mutate(Cumulative_Piece = cumsum(Duration),
Month_Year = as.factor(as.yearmon(Date_Start)),
Month_format = str_replace(Month_Year, " 20", "\n '"))%>%
ungroup()%>%
mutate(Cumulative_Total = cumsum(Duration))%>%
filter(Project %notin% c("Technique", "General", "Sightreading"))%>%
left_join(model_data%>%select(Level, Project, ABRSM), by = "Project")%>%
#fix letter issue UTC
# NOTE(review): `guide = FALSE` and `geom_line(size = …)` are deprecated in
# recent ggplot2 (use guide = "none" / linewidth) — left as-is pending a
# package-version check.
ggplot(aes(Date_Start, Cumulative_Piece, fill = Level)) +
geom_point(size = 10, shape = 21, col = "black", alpha = 0.5) +
scale_size(range = c(.1, 16), guide = FALSE) +
#geom_text(aes(x = as.Date("2020-05-01"), y = 40, label = Month_Year), size = 15, color = 'lightgrey', family = 'Oswald') +
labs(title = 'Year: {frame_time}',
y = "Total practice time per piece (hours)")+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es() +
theme(legend.position = "top")+
transition_time(Date_Start) +
ease_aes('linear')+
exit_fade() +
shadow_mark(alpha = 0.1, size = 5)#save animation as gif for later use
anim_save("figs/timeline.gif")
#make geoms persist https://stackoverflow.com/questions/63913059/is-there-a-way-to-make-geoms-fade-but-persist-in-gganimateGenerally, I’ve done pretty well to maintain a high level of consistency with the exception of August/December, whenever I go on Annual Leave.
# Monthly practice consistency: percentage of days in each month with at
# least one session, flagged Good/Bad against a 75% threshold; bar labels
# show the number of days with no practice at all.
raw_data%>%
filter(Source != "Estimated")%>%
group_by(Month_Year, Month_Start, Month_format)%>%
summarise(Days_Practice = n_distinct(Date_Start),
Total_Duration = sum(Duration, na.rm = TRUE))%>%
mutate(Days_Total = days_in_month(Month_Start),
Days_Not_Practiced = Days_Total - Days_Practice,
Avg_Duration = as.integer(Total_Duration/Days_Total),
Consistency = round(Days_Practice / Days_Total * 100,2),
Consistency_Status = ifelse(Consistency<75, "Bad", "Good"),
Month_format = reorder(Month_format, Month_Year))%>%
ggplot(aes(Month_format, Consistency, fill = Consistency_Status))+
geom_col(group = 1, col = "black")+
geom_hline(yintercept = 75, lty = "dashed")+
geom_text(aes(label = Days_Not_Practiced), size = 5, nudge_y = 3)+
labs(x = NULL,
fill = "Consistency status",
subtitle = "Numbers indicate days without any practice within each month")+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es()+
theme(legend.position = "top")We can see that these were correlated with the consistency, where the average session was much shorter in the months I was away from the piano. There’s also a trend where my practice close to an exam session was significantly higher than any other time of the year. Can you spot in which month I had my exam in 2019? What about 2020?
Note: the average practice length per month includes the days on which I did not practice.
# Average daily practice per month (minutes). The denominator is every day
# in the month — including days without practice — and bars are coloured by
# whether the average clears one hour.
raw_data%>%
filter(Source != "Estimated")%>%
group_by(Month_Year, Month_Start, Month_format)%>%
summarise(Days_Practice = n_distinct(Date_Start),
Total_Duration = sum(Duration))%>%
mutate(Days_Total = days_in_month(Month_Start),
Avg_Duration = as.integer(Total_Duration/Days_Total),
Avg_Duration_Status = ifelse(Avg_Duration < 60, "Less than one hour", "One hour"),
Month_format = reorder(Month_format, Month_Year))%>%
ggplot(aes(Month_format, Avg_Duration, fill = Avg_Duration_Status))+
geom_col(col = "black")+
labs(x = NULL,
y = "Average practice session length (min)",
fill = "Status")+
geom_text(aes(label = Avg_Duration), nudge_y = 5, size = 5)+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es()+
theme(legend.position = "top",
axis.text.y = element_blank(),
axis.ticks.y = element_blank())Similar trends as before are apparent where my average daily session is longer before the exams than any other time in the year and a dip in the months where I usually take most of my annual leave. I really do need to start picking up the pace and get back to where I used to be.
# Year-over-year comparison of average daily practice by calendar month;
# line size distinguishes the partial 2018 series, and each line is labelled
# at its January point with the year (via geom_label_repel).
raw_data%>%
group_by(Month_Year, Month_Start, Month_format, Month_Name, Year)%>%
summarise(Days_Practice = n_distinct(Date_Start),
Total_Duration = sum(Duration))%>%
mutate(Days_Total = days_in_month(Month_Start),
Avg_Duration = as.integer(Total_Duration/Days_Total),
Avg_Duration_Status = ifelse(Avg_Duration < 60, "Less than one hour", "One hour"),
Month_format = reorder(Month_format, Month_Year),
size = as.factor(ifelse(Year == 2018, 1, 1.5)),
label = ifelse(month(Month_Start) == 1, as.character(Year), ""))%>%
ggplot(aes(Month_Name, Avg_Duration, group = Year, size = size))+
geom_line(aes(col = Year))+
geom_label_repel(aes(label = label, col = Year))+
labs(x = NULL,
fill = "Status")+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es()+
theme(legend.position = "none")Despite a similar median, we can see that there was a reduction towards the 75th percentile of my practice sessions. We can test if this was a significant impact with a t-test.
# Split daily practice totals at 23 Mar 2020 (presumably the UK lockdown
# start — confirm) into Before/After COVID groups, reordered so "Before
# COVID" plots first.
covid_start <- as.Date("2020/03/23")
inference <- raw_data%>%
filter(Source != "Estimated")%>%
mutate(Covid_Status = as.factor(ifelse(Date_Start < covid_start, "Before COVID", "After COVID")),
Covid_Status = reorder(Covid_Status, desc(Covid_Status)))%>%
group_by(Covid_Status, Date_Start)%>%
summarise(Duration = sum(Duration))%>%
ungroup()
# Boxplot with widths proportional to group sizes (varwidth = TRUE).
# NOTE(review): Duration here is a daily total, so the y-axis label
# "Average practice session (min)" is an approximation — confirm intent.
ggplot(inference, aes(Covid_Status, Duration, fill = Covid_Status))+
geom_boxplot(varwidth = TRUE, col = "black")+
labs(x = NULL,
y = "Average practice session (min)")+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es()+
theme(legend.position = "none")Given the extremely low p-value, the Shapiro-Wilk normality test implies that the distribution of the data is significantly different from a normal distribution and that we cannot assume the normality. However, we’re working with the entire population dataset for each class and thus, unlike the independence of data, this assumption is not crucial.
# Shapiro-Wilk normality test of daily practice duration within each COVID
# period, rendered as a kableExtra table.
# Fix: full_width = F -> FALSE (T/F are reassignable variables, not keywords).
# NOTE(review): dplyr::do() is superseded; retained here for compatibility.
inference %>%
select(Covid_Status, Duration) %>%
group_by(group = as.character(Covid_Status)) %>%
do(tidy(shapiro.test(.$Duration)))%>%
kbl(caption = "Shapiro-Wilk normality test")%>%
kable_paper("hover", full_width = FALSE)| group | statistic | p.value | method |
|---|---|---|---|
| After COVID | 0.9607325 | 3e-07 | Shapiro-Wilk normality test |
| Before COVID | 0.9549818 | 0e+00 | Shapiro-Wilk normality test |
We can see that, with a large p-value, we fail to reject the null hypothesis (H0): we do not have evidence that the population variances are unequal, so we can use the equal-variances assumption for our t-test.
# Levene's test for homogeneity of variance between the two COVID periods;
# a large p-value supports using the equal-variances t-test below.
# Fix: full_width = F -> FALSE (T/F are reassignable variables, not keywords).
tidy(leveneTest(inference$Duration~inference$Covid_Status))%>%
kbl(caption = "Levene's test")%>%
kable_paper("hover", full_width = FALSE)| statistic | p.value | df | df.residual |
|---|---|---|---|
| 0.0410026 | 0.8395891 | 1 | 732 |
My practice sessions after COVID are significantly shorter than those before the pandemic. This might be surprising, given that we were in the UK most of the time. However, I’ve been spending my time doing a few other things, such as improving my technical skillset with R (this analysis wouldn’t have been possible otherwise) and learning Italian.
# Two-sample t-test (equal variances, as supported by Levene's test above) of
# daily practice duration before vs after COVID, rendered as a table.
# Fix: full_width = F -> FALSE (T/F are reassignable variables, not keywords).
t_test <- inference%>%
t_test(Duration ~ Covid_Status, var.equal = TRUE)%>%
add_significance()%>%
kbl()%>%
kable_paper("hover", full_width = FALSE)
t_test| .y. | group1 | group2 | n1 | n2 | statistic | df | p | p.signif |
|---|---|---|---|---|---|---|---|---|
| Duration | Before COVID | After COVID | 433 | 301 | 3.319481 | 732 | 0.000947 | *** |
#write these as a function
#remove axis
# Top-10 genres by total practice hours (horizontal bars, hours labelled).
raw_data%>%
group_by(Genre)%>%
summarise(Duration = as.integer(sum(Duration)/60))%>%
mutate(Genre = reorder(Genre, Duration))%>%
arrange(desc(Duration))%>%
filter(Genre %notin% c("Other", "Not applicable"))%>%
head(10)%>%
ggplot(aes(Genre, Duration, fill = Duration))+
geom_col(show.legend = FALSE, col = "black", width = 1)+
geom_text(aes(label = Duration), show.legend = FALSE, nudge_y = 25)+
scale_fill_gradient(low="yellow", high="red")+
labs(x = NULL,
y = "Total hours of practice",
subtitle = "*Not applicable* - unclassified practice and *Other* - sight reading + technique practice")+
coord_flip()+
# Top-10 composers by total practice hours.
# NOTE(review): the subtitle below is copied from the genre plot; *Other* is
# not filtered here — confirm the wording still applies.
theme_ipsum_es()raw_data%>%
filter(Composer != "Not applicable")%>%
group_by(Composer)%>%
summarise(Duration = as.integer(sum(Duration)/60))%>%
mutate(Composer = reorder(Composer, Duration))%>%
arrange(desc(Duration))%>%
head(10)%>%
ggplot(aes(Composer, Duration, fill = Duration))+
geom_col(show.legend = FALSE, col = "black", width = 1)+
geom_text(aes(label = Duration), show.legend = FALSE, nudge_y = 6)+
scale_fill_gradient(low="yellow", high="red")+
labs(x = NULL,
y = "Total hours of practice",
subtitle = "*Not applicable* - unclassified practice and *Other* - sight reading + technique practice")+
coord_flip()+
# Top-15 individual pieces by total practice hours, excluding the non-piece
# projects (Technique / General / Sightreading).
theme_ipsum_es()raw_data%>%
group_by(Project)%>%
summarise(Duration = as.integer(sum(Duration)/60))%>%
mutate(Project = reorder(Project, Duration))%>%
arrange(desc(Duration))%>%
filter(Project %notin% c("Technique", "General", "Sightreading"))%>%
head(15)%>%
ggplot(aes(Project, Duration, fill = Duration))+
geom_col(show.legend = FALSE, col = "black", width = 1)+
geom_text(aes(label = Duration), show.legend = FALSE, nudge_y = 2)+
scale_fill_gradient(low="yellow", high="red")+
labs(x = NULL,
y = "Total hours of practice",
title = "Top 15 pieces by hours of practice")+
coord_flip()+
theme_ipsum_es()# think about gtable?
# add stuff (links to videos)
# (classify as green/red/yellow categories) and link to video where possible
# Scrollable reference table of every piece in the modelling dataset.
# Fixes: full_width = F -> FALSE and bold = T -> TRUE (T/F are reassignable);
# placeholder caption "test" replaced with a descriptive one.
# NOTE(review): select() mixes a positive selection (Standard) with negative
# ones — verify the intent; the rendered table does include Standard.
model_data%>%
select(-Days_Practiced, Standard, -Length)%>%
mutate(Duration = round(Duration))%>%
arrange(desc(Date_Start))%>%
kbl(escape = FALSE,
caption = "Pieces in the modelling dataset, most recent first")%>%
kable_paper(c("hover", "striped"), full_width = FALSE)%>%
column_spec(c(1,3), bold = TRUE, color = "black")%>%
scroll_box(height = "450px")| Project | Genre | Duration | Date_Start | Date_End | ABRSM | Standard | Level | Cumulative_Duration |
|---|---|---|---|---|---|---|---|---|
| Elton John - Rocket man | Modern | 47 | 2020-12-08 | 2021-04-15 | 7 | Performance | Advanced | 46.600000 |
| Schumann - Träumerei | Romantic | 14 | 2020-11-09 | 2020-12-05 | 7 | Average | Advanced | 14.200000 |
| Mozart - Allegro (3rd movement) K282 | Classical | 28 | 2020-11-05 | 2021-04-18 | 6 | Average | Intermediate | 27.566667 |
| Ibert - Sérénade sur l’eau | Modern | 10 | 2020-09-24 | 2020-10-27 | 6 | Performance | Intermediate | 10.383333 |
| Kuhlau - Rondo Vivace | Classical | 24 | 2020-08-03 | 2020-10-27 | 6 | Average | Intermediate | 24.066667 |
| C. Hartmann - The little ballerina | Romantic | 21 | 2020-07-14 | 2020-10-27 | 6 | Performance | Intermediate | 21.066667 |
| Schumann - Lalling Melody | Romantic | 5 | 2020-06-28 | 2020-08-14 | 1 | Average | Beginner | 4.633333 |
| Schumann - Melody | Romantic | 4 | 2020-06-20 | 2020-07-22 | 1 | Average | Beginner | 4.066667 |
| Clementi - Sonatina no 3 - Mov 2 | Classical | 3 | 2020-06-04 | 2020-06-24 | 1 | Performance | Beginner | 3.116667 |
| Clementi - Sonatina no 3 - Mov 3 | Classical | 20 | 2020-06-04 | 2020-07-11 | 4 | Performance | Beginner | 20.333333 |
| Chopin - Waltz in Fm | Romantic | 27 | 2020-04-18 | 2020-10-27 | 6 | Performance | Intermediate | 27.416667 |
| Clementi - Sonatina no 3 - Mov 1 | Classical | 30 | 2020-04-07 | 2020-06-05 | 4 | Performance | Beginner | 29.533333 |
| Schumann - Kinderszenen 1 | Romantic | 10 | 2020-03-25 | 2020-04-18 | 5 | Average | Intermediate | 9.866667 |
| Bach - Prelude in G from Cello Suite No 1 | Baroque | 25 | 2020-02-04 | 2020-04-10 | 5 | Average | Intermediate | 24.733333 |
| Georg Böhm - Minuet in G | Baroque | 7 | 2020-01-27 | 2020-04-18 | 1 | Average | Beginner | 6.516667 |
| Bach - Invention 4 in Dm | Baroque | 21 | 2020-01-25 | 2020-03-26 | 5 | Performance | Intermediate | 20.666667 |
| Chopin - Contredanse in Gb | Romantic | 23 | 2020-01-16 | 2020-03-21 | 6 | Performance | Intermediate | 22.916667 |
| Bach - Minuet in Gm - 115 | Baroque | 7 | 2020-01-07 | 2020-02-01 | 1 | Average | Beginner | 6.833333 |
| Elton John - Your song (Arr Cornick) | Modern | 36 | 2019-11-21 | 2020-02-07 | 5 | Performance | Intermediate | 35.683333 |
| Poulenc - Valse Tyrolienne | Modern | 17 | 2019-09-02 | 2019-11-07 | 5 | Performance | Intermediate | 16.800000 |
| Bach - Prelude in Cm - 934 | Baroque | 25 | 2019-08-15 | 2019-09-29 | 1 | Performance | Beginner | 24.950000 |
| Schumann - Volksliedchen | Romantic | 10 | 2019-07-01 | 2019-07-28 | 2 | Average | Beginner | 9.750000 |
| Haydn - Andante in A | Classical | 39 | 2019-06-08 | 2019-11-07 | 5 | Average | Intermediate | 39.033333 |
| Schumann - Remembrance | Romantic | 34 | 2019-04-28 | 2019-11-07 | 5 | Performance | Intermediate | 34.050000 |
| Chopin - Waltz in Am | Romantic | 26 | 2019-01-07 | 2019-11-25 | 4 | Performance | Beginner | 26.116667 |
I define them as beginner, intermediate and advanced across the 8 grades. In reality, there are also diplomas, equivalent to university degrees, but those are beyond the scope of this analysis (worth returning to in 5 years).
Simplified, ABRSM grades are a group of 8 graded exams ranked by difficulty (1 – beginner to 8 – advanced). There are also diploma grades, but those are extremely advanced — the equivalent of university-level studies — and out of the scope of this analysis.
More information can be found on their official website at https://gb.abrsm.org/en/exam-support/your-guide-to-abrsm-exams/
# Distribution of total practice hours per piece by ABRSM grade; outliers are
# hidden by the boxplot because geom_jitter() already shows every raw point.
# Fix: removed the no-op `mutate(Duration = Duration)` step.
model_data%>%
ggplot(aes(ABRSM, Duration, fill = ABRSM))+
geom_boxplot(outlier.shape = NA)+
geom_jitter(width = 0.1, height = 0.1, alpha = 0.5)+
labs(x = "ABRSM Grade",
y = "Total practice hours")+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es()+
theme(legend.position = "none")A further aggregration of Grades; this is helpful given the very limited dataset. This is an oversimplification but they’re classified as: * 1-5: Beginner (1) * 5-6: Intermediate (2) * 7-8: Advanced (3)
# Distribution of total practice hours per piece by aggregated Level.
# Fixes: `outlier.shape = NA` must be a geom parameter, not an aesthetic —
# wrapped in aes() it was ignored, so outliers were drawn twice (once by the
# boxplot, once by the jitter). Also removed the no-op
# `mutate(Duration = Duration)` step.
model_data%>%
ggplot(aes(Level, Duration, fill = Level))+
geom_boxplot(outlier.shape = NA)+
geom_jitter(width = 0.2, height = 0.2)+
scale_color_tron()+
scale_fill_tron()+
labs(x = "Level",
y = "Total practice hours")+
theme_ipsum_es()+
theme(legend.position = "none")We can spot a trend where the time required to learn a piece of a similar difficulty (ABRSM Grade) decreases as my ability to play the piano increases (as judged by cumulative hours of practice), for ABRSM grades with a significant sample size. We should keep this in mind and include it as a variable into our prediction model.
# Hours-to-learn vs cumulative experience, faceted by ABRSM grade, with a
# per-Level linear fit: downward slopes suggest pieces of a given difficulty
# get quicker to learn as overall experience accumulates.
model_data%>%
ggplot(aes(Cumulative_Duration, Duration, col = Level))+
geom_point()+
geom_smooth(method = "lm", se=FALSE)+
facet_wrap(.~ABRSM)+
labs(x = "Cumulative hours practiced",
y = "Hours needed to learn a piece")+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es()+
theme(legend.position = "none")How do we differentiate between pieces that we learn once and those that we come back to repeatedly? Examples could include wanting to improve the playing further, loving it so much we wanted to relearn it, preparing it for a new performance, etc.
As anyone that ever played the piano knows, re-learning a piece, particularly after you “drop” it for a few months/years, results in a much better performance/understanding of the piece. I definitely found that to be true in my experience, particularly with my exam pieces.
The downside (when it comes to modelling) is that these pieces take longer to learn. Not only do you play a piece past the point where you first “learnt” it, but not playing a piece for a while also results in losing some of the skill required to play it. As a result, you have to both “catch up” to where you were and then build past that point. If we don’t include this as a variable in our model, some pieces will appear to require more time to learn than they actually did.
Knowing my practice habits, I never practice a piece for more than 3–4 months without a longer break in between, so I chose 130 days as a “delimiter”. Looking at those outside this range, I can see that the highlighted ones were all pieces I came back to rather than pieces I practiced continuously. Unsurprisingly, they sit towards the higher end of total time practiced per piece within each grade.
This is a simplification of the whole matter (people literally have PhDs on the topic of spaced repetition), so it cannot all be summarised in a single variable. However, I thought this simplified version would be better than nothing.
# Hours-to-learn vs total days a piece stayed in rotation, faceted by grade;
# pieces practiced over more than 130 days (the revisit "delimiter") are
# highlighted and labelled. str_replace_all strips non-printable characters
# from names so the repel labels render cleanly.
# Fix: corrected the garbled x-axis label ("since I the first time I
# started" -> "since the first time I started").
model_data%>%
mutate(Project_formatted = str_replace_all(Project,"[^[:graph:]]", " "),
Project_label = ifelse(Days_Practiced > 130, Project_formatted, ""))%>%
ggplot(aes(Days_Practiced, Duration, col = Days_Practiced > 130))+
geom_point()+
geom_vline(xintercept = 130, col = "gray", lty = "dashed")+
geom_smooth(method = "lm", se=FALSE)+
geom_text_repel(aes(label = Project_label), size = 3)+
facet_wrap(.~ABRSM)+
scale_color_tron()+
scale_fill_tron()+
labs(x = "Days passed since the first time I started a piece (to the last practice session)",
y = "Hours needed to learn a piece")+
theme_ipsum_es()+
theme(legend.position = "none")Question: How long would it take to practice a piece based on various factors?
This is particularly important given the small dataset, where one outlier could significantly impact the models.
# graph the other variables in the model above?
# Outlier eyeball check before modelling: histogram of each piece's deviation
# from the mean Duration.
model_data%>%
mutate(Duration2 = mean(Duration),
mx = Duration2 - Duration)%>%
ggplot(aes(mx))+
geom_histogram()# to find boxplot, scatterplot, z-score, IQR score
# to handle: cap at some threshold, transformations to reduce skewness (BoxCox), remove outliers only if they are anomalies or errors
# remove Elton John, remove Bach (restudied but only tracked the second time, remove Clementi extremely small with repeat)The dataset does not have any missing values. The pieces that did not have an official ABRSM grading were given one based on searching for opinions online and consulting with my teacher.
# Scratchpad of modelling ideas and checks, intentionally left commented out:
# feature plots, near-zero-variance screening, multicollinearity (VIF), etc.
#regVar <- c("Days_Practiced", "Length", "Cumulative_Duration")
# featurePlot(x = model_data[, regVar],
# y = model_data$Duration,
# col = model_data$Level,
# plot = "scatter",
# type = c("p", "smooth"),
# span = .5,
# layout = c(3, 1))
# nearZeroVar(model_data)
# categorical variable can be seen as continuous only when it is ordinal in nature
# max time between two different sessions on the same piece exceeds a limit
#binarisation of data (ie. Standard)
# VIF formulticollinearity
# regularisationLet’s use some basic standardisation offered by the caret package such as centering (subtract mean from values) and scaling (divide values by standard deviation).
Given the small size of the dataset, bootstrapping resampling method will be applied.
# Resampling scheme shared by every caret::train call below — bootstrap
# (10 repetitions) with random hyper-parameter search, chosen because the
# dataset is tiny.
train.control <- trainControl(method = "boot",
number = 10,
search = "random")
# Drop one outlier piece before modelling (see the outlier notes above).
#These variables have zero variances: GenreNot applicable, GenreOthermodel_data <- model_data%>%filter(Project != "Elton John - Rocket man")
# Train seven candidate models in parallel on a 4-worker SNOW cluster.
# Models 1-6 share one formula: the in-formula
# ifelse(Days_Practiced>130, 2, 1) encodes "piece was revisited after a long
# break" as a derived predictor (see the 130-day analysis above); model7
# swaps it for raw Days_Practiced.
clusters <- 4
#run them all in paralel
cl <- makeCluster(clusters, type = "SOCK")
#register cluster train in paralel
registerDoSNOW(cl)
#train models
model <- train(Duration ~ ABRSM + Genre + Length + Cumulative_Duration + ifelse(Days_Practiced>130, 2, 1) + Standard,
data = model_data,
method = "ranger",
preProcess = c("center", "scale", "BoxCox"),
tuneLength = 100,
trControl = train.control)
model2 <- train(Duration ~ ABRSM + Genre + Length + Cumulative_Duration + ifelse(Days_Practiced>130, 2, 1) + Standard,
data = model_data,
method = "lmStepAIC",
preProcess = c("center", "scale", "BoxCox"),
tuneLength = 100,
trControl = train.control)
model3 <- train(Duration ~ ABRSM + Genre + Length + Cumulative_Duration + ifelse(Days_Practiced>130, 2, 1) + Standard,
data = model_data,
method = "lm",
preProcess = c("center", "scale", "BoxCox"),
tuneLength = 100,
trControl = train.control)
model4 <- train(Duration ~ ABRSM + Genre + Length + Cumulative_Duration + ifelse(Days_Practiced>130, 2, 1) + Standard,
data = model_data,
method = "gbm",
preProcess = c("center", "scale", "BoxCox"),
tuneLength = 100,
trControl = train.control)
model5 <- train(Duration ~ ABRSM + Genre + Length + Cumulative_Duration + ifelse(Days_Practiced>130, 2, 1) + Standard,
data = model_data,
method = "rf",
preProcess = c("center", "scale", "BoxCox"),
tuneLength = 100,
trControl = train.control)
# NOTE(review): model6 repeats model4's gbm specification exactly — possibly
# an intentional stability check, possibly a leftover; confirm.
model6 <- train(Duration ~ ABRSM + Genre + Length + Cumulative_Duration + ifelse(Days_Practiced>130, 2, 1) + Standard,
data = model_data,
method = "gbm",
preProcess = c("center", "scale", "BoxCox"),
tuneLength = 100,
trControl = train.control)
#repeat
model7 <- train(Duration ~ ABRSM + Genre + Length + Cumulative_Duration + Days_Practiced + Standard,
data = model_data,
method = "gbm",
preProcess = c("center", "scale", "BoxCox"),
tuneLength = 100,
trControl = train.control)
#shut the instances of R down
stopCluster(cl)
#compare models
# Compare the bootstrap resampling distributions of all seven models.
model_list <- list(one = model, two = model2, three = model3, four = model4, five = model5, six = model6, seven = model7)
model_comparison <- resamples(model_list)
summary(model_comparison)
# Estimate accuracy based on different groups? why does the model perform badly there
# keep LM model for explanation or even RF
# correlation
# learning curves to indicate overfitting and underfitting
# transform days_practiced into something more like 1-2-3 based on 120 days? why is chopin so high
# hyper parameters
# https://topepo.github.io/caret/model-training-and-tuning.html#model-training-and-parameter-tuning
# https://topepo.github.io/caret/random-hyperparameter-search.htmlWe chose the Random Forest model as it was the best performing model. It is known as a model which is:
# Persist the selected model (the random forest, model5) and assemble a
# diagnostics dataset pairing predictions with actuals and residuals.
selected_model <- model5
#Saving the model
saveRDS(selected_model, file = "model.rda")
#get predictions
# NOTE(review): predictions are computed on the training data itself, so the
# plots below show in-sample fit, not out-of-sample performance.
predictions <- predict(selected_model, model_data)
#create dataset
model_data2 <- model_data
model_data2$Predicted <- predictions
model_data2$Actual <- model_data$Duration
model_data2$Residuals <- model_data2$Actual - model_data2$Predicted
# model_data2 <- model_data%>%
# mutate(Actual = as.numeric(Duration),
# Predicted = as.numeric(predictions),
# Residuals = Actual - Predicted)%>%
# select(Predicted, Actual, Residuals, Project, Level, Genre)
#visualise predicted vs actual
# Interactive predicted-vs-actual plot: the dashed y = x line marks perfect
# prediction; the loess curve exposes systematic bias.
ggplotly(
ggplot(model_data2, aes(Predicted, Actual, label = Residuals, col = Level))+
geom_point(aes(text = Project))+
geom_smooth(method = "loess", col = "red", lwd = 1, se = FALSE)+
geom_abline(lty = "dashed", lwd = 0.5, col = "gray")+
coord_cartesian(xlim = c(0,50), ylim = c(0,50))+
labs(col = NULL)+
scale_color_tron()+
theme_ipsum_es() +
theme(legend.position = "top")
) %>%
layout(legend = list(
orientation = "h",
x = 0.4, y = 1.2))We can see that the residuals are mostly situated around 0.
# Histogram of model residuals; the dashed line marks the mean residual.
# NOTE(review): `..count..` is deprecated in ggplot2 >= 3.4 — prefer
# after_stat(count) once the installed package version is confirmed.
ggplot(model_data2, aes(Residuals, fill = ..count..))+
geom_histogram(binwidth = 1, col = "black")+
geom_vline(aes(xintercept=mean(Residuals)), lwd = 1, lty = 2) +
labs(x="Residuals",
y= "Total occurences")+
scale_fill_gradient(low="yellow", high="red")+
theme_ipsum_es()+
theme(legend.position = "none")Lastly, we can see that there is a constant variability of errors. However, there is still a tendency to underpredict for pieces that took very little and over predict required time for pieces that took longer than necessary.
# Interactive residuals-vs-actual plot, followed by caret::compare_models —
# a paired t-test on the resampled performance of model4 (gbm) vs model5 (rf).
# Fixes: full_width = F -> FALSE (T/F are reassignable); resolved the
# author-flagged "#change this" placeholder caption.
ggplotly(
ggplot(model_data2, aes(Actual, Residuals, col = Level, label = Predicted))+
geom_hline(yintercept = 0, size = 3, color = "grey52")+
geom_point(aes(text = Project), alpha = 0.5)+
geom_smooth(method = "loess", col = "red", se = FALSE)+
labs(col = NULL)+
scale_color_tron()+
theme_ipsum_es()
) %>%
layout(legend = list(orientation = "h",x = 0.4, y = 1.2))tidy(compare_models(model4, model5))%>%
kbl(caption = "Model 4 (gbm) vs model 5 (random forest)")%>%
kable_paper("hover", full_width = FALSE)| estimate | statistic | p.value | parameter | conf.low | conf.high | method | alternative |
|---|---|---|---|---|---|---|---|
| 1.546428 | 2.879039 | 0.0182065 | 9 | 0.3313476 | 2.761508 | One Sample t-test | two.sided |
These results also confirm that the Random Forest model is significantly better than the other two.
# Tuning curve of the final random forest, then a lollipop chart of caret
# variable importance (varImp) with percentage labels.
plot(model5,
main = "The most optimal model was that with 5 predictors", col = "orange", lwd = 3)imp <- varImp(model5)
# Lollipop construction: invisible white columns provide the scale, segments
# form the stems, and orange points with text labels cap each stem.
ggplot(imp, size = 6)+
geom_col(col = "white", fill = "white")+
geom_segment(aes(Feature, y = 0, xend = Feature, yend = Importance), col = "black", size = 1) +
geom_point(size = 8, col = "orange")+
geom_text(aes(label = paste(round(Importance), "%", sep = "")), color = "black", size = 3, check_overlap = TRUE)+
scale_color_tron()+
scale_fill_tron()+
theme_ipsum_es()+
labs(title = "Variable importance ranking")+
theme(axis.text.x = element_blank(),
axis.ticks = element_blank())#compare with linear regressionWhat’s next?